#“Income inequality is unrelated to perceived inequality and support for redistribution” by Kris-Stella Trump, forthcoming in Social Science Quarterly

# Thank you to Riley Carney for her contributions to this code
# Written in R version 4.0.2
# Last update: 09.30.2020

# The code in this file merges multiply-imputed ISSP data with SWIID 8.1 data

#load tidyverse
library(tidyverse)
library(haven)
library(mi)
library(foreign)

#Load imputed ISSP data
load("issp_imputed.RData")

#extract the imputed datasets into a list 
issp <- complete(issp_imputed, m=5)

#Load SWIID data
load("swiid8_1.rda")

#little detour: some countries did not ask these questions. go back to create an indicator for this.
addon <- read.dta("Add-on data ZA5891_v1-1-0.dta")
#first look at perceptions and preferences to confirm where questions were not asked
addon %>% 
  group_by(V7) %>%
  summarise(
    count = n(),
    mean_ceo_perc = mean(V9, na.rm=T),
    mean_ceo_pref = mean(V24, na.rm=T),
    ceo_ratio = mean(V9/V24, na.rm=T),
    mean_work_perc = mean(V11, na.rm=T),
    mean_work_pref = mean(V26, na.rm=T),
    work_ratio = mean(V11/V26, na.rm=T)
  ) %>%
  print(n=Inf)
#also look at how much was imputed
addon %>% 
  group_by(V7) %>%
  summarise(
    count = n(),
    missing_ceo_perc = sum(is.na(V9))/n(),
    missing_ceo_pref = sum(is.na(V24))/n(),
    missing_work_perc = sum(is.na(V11))/n(),
    missing_work_pref = sum(is.na(V26))/n()
  ) %>%
  print(n=Inf)

# put country-year descriptive names back into the imputed datasets
country_years <- addon %>%
  group_by(V7) %>%
  summarise(
    cy = mean(as.numeric(V7))
  ) %>%
  rename(
    cy_name = V7,
    V7 = cy
  )
issp <- issp %>% 
  map(. %>% 
        left_join(country_years, by="V7")
      )


#create a variable indicating that the salary questions were not asked
#this will be useful later, so as to not use country-years that started out with zero information and were imputed purely based on other countries
issp <- issp %>% 
  map(. %>% 
        mutate(
          missing_ratio = (cy_name=="AT_1992" | cy_name=="IT_1987")
        ))

#check that factor levels matched correctly (compare to addon summary above) and missingness variable worked
issp[[1]] %>% 
  group_by(cy_name) %>%
  summarise(
    count = n(),
    V7_mean = mean(V7),
    missingratio = mean(missing_ratio)
  ) %>%
  print(n=Inf)

#what's up with Cyprus 1999 having no missing data in the "actually earns" / "should earn" questions
CY1999 <- addon %>% 
  filter(V7=="CY_1999")
summary(CY1999$V9)
summary(CY1999$V24)
summary(CY1999$V11)
summary(CY1999$V26)
ggplot(CY1999) +
  geom_point(mapping = aes(as.numeric(V9),as.numeric(V24)))
ggplot(CY1999) +
  geom_point(mapping = aes(as.numeric(V11),as.numeric(V26)))
#looks OK. the outliers are not so strange as to seem wrong
#conclude that while it is strange that no respondents refused, I can't find an obvious issue to fix here.

#now done with some of these datasets; slight cleanup
rm("addon", "country_years", "CY1999")


#For the countries where the compensation questions were not asked, replace the variables with NA
issp <- issp %>%
  map(. %>%
        mutate(unsk_is_log = ifelse(missing_ratio==1,NA,unsk_is_log),
               unsk_ought_log = ifelse(missing_ratio==1, NA, unsk_ought_log),
               ceo_is_log = ifelse(missing_ratio==1, NA, ceo_is_log),
               ceo_ought_log = ifelse(missing_ratio==1, NA, ceo_ought_log))
  )

#Calculate desired pay ratios using logarithmic attribute of quotients
issp <- issp %>%
  map(. %>%
        mutate(unsk_ratio_log = unsk_is_log - unsk_ought_log,
               ceo_ratio_log = ceo_is_log - ceo_ought_log,
               unsk_ratio = exp(unsk_ratio_log),
               ceo_ratio = exp(ceo_ratio_log))
  )

#have a look at the data
as_tibble(issp[[1]]) %>% print(n=10, width=Inf)
as_tibble(swiid[[1]]) %>% print(n=10, width=Inf)


####Merge swiid and issp data. 

#Rename country-names in issp to match SWIID
library(countrycode)
issp <- issp %>% 
  map(. %>% mutate(cy_name = recode(cy_name, "CSFR: CZ_1992" = "CZ_1992"),
                  cy_name = recode(cy_name, "CSFR: SK_1992" = "SK_1992")
  ))

issp <- issp %>%
  map(. %>% mutate(country = as_factor(countrycode(substr(cy_name,1,2), 
                                       origin="iso2c",
                                       destination = "country.name"))))
issp <- issp %>%
  map(. %>% mutate(country = fct_recode(country, 
                                        "Czech Republic" = "Czechia"
  )))


#we will give each individual respondent inequality values that are randomly selected from the 
#100 available swiid imputations for their country-year

#first create the variables
issp <- issp %>% 
  map(. %>% 
        add_column(gini_disp = NA, gini_mkt=NA,abs_red=NA,rel_red=NA)
      )

#now randomly select one of the 100 available SWIID entries for each individual in the issp, matching on country and year
issp_swiid <- list()
set.seed(20200930)
for(k in 1:length(issp)){
  imputed_issp <- issp[[k]]
  for(i in 1:nrow(imputed_issp)){
    q <- sample(c(1:100),1,replace=T)
    sampled_ginis <- swiid[[q]] %>% 
      filter(. , country==imputed_issp[i,"country"], year==imputed_issp[i,"year"]) %>%
      select(gini_disp, gini_mkt, abs_red, rel_red)
    imputed_issp[i,] <- imputed_issp %>% 
      slice(. , i) %>%
      mutate("gini_disp"=sampled_ginis$gini_disp,
             "gini_mkt"=sampled_ginis$gini_mkt,
             "abs_red"=sampled_ginis$abs_red,
             "rel_red"=sampled_ginis$rel_red)
  }
  issp_swiid[[k]] <- imputed_issp
}

#run some sanity checks
#pick one dataset for a closer look
issp_5 <- issp_swiid[[5]]
as_tibble(issp_5) %>% print(n=10, width=Inf)
issp_5 %>%
  summarise(ginimean=mean(gini_mkt), 
            ginirange_low=range(gini_mkt)[1],
            ginirange_high=range(gini_mkt)[2],
            ginisd = sd(gini_mkt),
            ginimiss = sum(is.na(gini_mkt)))
issp_5 %>%
  summarise(ginimean=mean(gini_disp), 
            ginirange_low=range(gini_disp)[1],
            ginirange_high=range(gini_disp)[2],
            ginisd = sd(gini_disp),
            ginimiss = sum(is.na(gini_disp)))
issp_5 %>%
        group_by(country, year) %>%
        summarise(gini_mkt_mean = mean(gini_mkt)) %>%
        ggplot(.) + 
            geom_point(mapping = aes(year, gini_mkt_mean))

# a couple of final adjustments for easier analysis
#log measures of perceived inequality and preferred inequality
#create preferred and perceived inequality measures
#recode some of the key variables so that larger values indicate agreement with question wording
issp_swiid <- issp_swiid %>%
  map(. %>%
        mutate(perc_log = ceo_is_log - unsk_is_log,
               pref_log = ceo_ought_log - unsk_ought_log,
               perc = exp(perc_log),
               pref = exp(pref_log),
               toohigh = 6 - V32,
               reduce = 6 - V33,
               taxrich = 6-as.numeric(V39),
               spendless = 6-V37,
               getahead_wealthy = 6-V8
               )
  )

#Save finished dataset
save(issp_swiid, file="issp_swiid.RData")
